##Create a map of the US Coastline and plot landfall points
## ID Name state.name x y Max.Wind Year
## 1 AL011900 UNNAMED texas -95.17883 29.16898 119.35583 1900
## 1008 AL031903 UNNAMED florida -85.58116 30.06231 79.84683 1903
## 1775 AL061906 UNNAMED mississippi -88.68922 30.37844 94.96951 1906
## 1864 AL081906 UNNAMED florida -80.79391 25.15913 101.75561 1906
## 2405 AL021909 UNNAMED texas -97.21436 26.10459 84.97311 1909
## 2463 AL041909 UNNAMED texas -95.36640 28.94130 97.46856 1909
## Month cat
## 1 09 3
## 1008 09 1
## 1775 09 1
## 1864 10 2
## 2405 06 1
## 2463 07 2
geography <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = toRGB("gray95"),
subunitcolor = toRGB("gray85"),
countrycolor = toRGB("gray85"),
countrywidth = 0.5,
subunitwidth = 0.5
)
storm_colors <- c("blue", "green", "yellow", "orange", "red")
storm_plot <- plot_geo(storm_landfall, lat = ~y, lon = ~x) %>%
add_markers(
text = ~paste(Year,"<br />",Name,"<br />","Cat:",cat),hoverinfo="text",
color = ~storm_landfall$cat, size = ~storm_landfall$Max.Wind,
colors = storm_colors,
marker = list(colorbar = list(len = 0.2, title = "Landfall Category"))
) %>%
layout(
title = 'US Hurricane Landfall points', geo = geography
)
storm_plot
storm_landfall1<-subset(storm_landfall,state.name %in% c("florida","louisiana","texas","mississippi", "alabama", "south carolina", "north carolina"))
options(warn = -1)
storm_box <- plot_ly(storm_landfall1, y = ~Max.Wind, color = ~state.name, type = "box") %>%
layout(title='Max Wind speed at landfall', yaxis = list(title = 'Max Wind speed [mph]'))
storm_box
# test linear regression against random forest
lm_model <- lm(storm_num ~ Time+Category+Latitude+Longitude+Maximum.Wind+Year+Month+timestamp , data = train_set)
rf_model = randomForest(storm_num ~ Category+Latitude+Longitude+Maximum.Wind+Year+Month+timestamp, data = train_set, ntree = 5)
test_set$scores = predict(rf_model, newdata = test_set)
summary(test_set)
## ID Name Time Category
## Length:11278 Length:11278 Length:11278 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Mode :character Median :0.0000
## Mean :0.2757
## 3rd Qu.:0.0000
## Max. :5.0000
## Latitude Longitude Maximum.Wind Minimum.Pressure
## Min. : 7.20 Min. :-108.90 Min. :-99.00 Min. :-999.00
## 1st Qu.:18.60 1st Qu.: -80.60 1st Qu.: 30.00 1st Qu.:-999.00
## Median :26.10 Median : -66.50 Median : 45.00 Median :-999.00
## Mean :26.71 Mean : -64.73 Mean : 49.41 Mean : -78.72
## 3rd Qu.:32.90 3rd Qu.: -50.73 3rd Qu.: 65.00 3rd Qu.: 997.00
## Max. :75.50 Max. : 0.00 Max. :160.00 Max. :1024.00
## Year Month timestamp
## Min. :1900 Length:11278 Min. :1900-08-28 06:00:00
## 1st Qu.:1939 Class :character 1st Qu.:1939-08-18 19:30:00
## Median :1969 Mode :character Median :1969-09-20 09:00:00
## Mean :1966 Mean :1966-03-09 10:59:12
## 3rd Qu.:1995 3rd Qu.:1995-09-21 04:30:00
## Max. :2015 Max. :2015-11-13 12:00:00
## YYYYMM storm_num scores
## Min. :190008 Min. : 1.000 Min. : 1.000
## 1st Qu.:193908 1st Qu.: 4.000 1st Qu.: 4.000
## Median :196909 Median : 7.000 Median : 7.041
## Mean :196559 Mean : 8.005 Mean : 8.015
## 3rd Qu.:199509 3rd Qu.:11.000 3rd Qu.:11.076
## Max. :201511 Max. :31.000 Max. :31.000
#get residuals
lm_exp <- DALEX::explain(lm_model, label = "lm", data = train_set, y = train_set$storm_num)
## Preparation of a new explainer is initiated
## -> model label : lm
## -> data : 28549 rows 13 cols
## -> target variable : 28549 values
## -> predict function : yhat.lm will be used ( [33m default [39m )
## -> predicted values : numerical, min = -3.417257 , mean = 8.023048 , max = 25
## -> model_info : package stats , ver. 3.6.3 , task regression ( [33m default [39m )
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -11.96225 , mean = -5.655027e-09 , max = 19.263
## [32m A new explainer has been created! [39m
rf_exp <- DALEX::explain(rf_model, label = "rf", data = train_set, y = train_set$storm_num)
## Preparation of a new explainer is initiated
## -> model label : rf
## -> data : 28549 rows 13 cols
## -> target variable : 28549 values
## -> predict function : yhat.randomForest will be used ( [33m default [39m )
## -> predicted values : numerical, min = 1 , mean = 8.024476 , max = 31
## -> model_info : package randomForest , ver. 4.6.14 , task regression ( [33m default [39m )
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -6.420833 , mean = -0.001428336 , max = 9.68
## [32m A new explainer has been created! [39m
lm_mr <- model_residual(lm_exp)
rf_mr <- model_residual(rf_exp)
# Compare accuracy of Random Forest vs. Linear Regression. Random Forest is more accurate
plot(rf_mr, lm_mr, type = "prediction", abline = TRUE)
predictions <- rf_model %>% predict(test_set)
RMSE <- RMSE(predictions, test_set$storm_num)
R2 <- R2(predictions, test_set$storm_num)
# Print Root Mean Square Error of the prediction
RMSE
## [1] 1.05466
# Print R-Squared
R2
## [1] 0.963336